{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0ae013fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/file_utils.ipynb\"\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61b95d33",
   "metadata": {},
   "source": [
    "# Basic method: spaCy lemma versus spaCy token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f900523e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "birds plural\n"
     ]
    }
   ],
   "source": [
    "text = \"I have five birds\"\n",
    "doc = small_model(text)\n",
    "for token in doc:\n",
    "    if (token.pos_ == \"NOUN\" and token.lemma_ != token.text):\n",
    "        print(token.text, \"plural\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a2321b6",
   "metadata": {},
   "source": [
    "# Number using morph features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "21114315",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Plur']\n"
     ]
    }
   ],
   "source": [
    "doc = small_model(\"I have five birds.\")\n",
    "print(doc[3].morph.get(\"Number\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1acf1f8e",
   "metadata": {},
   "source": [
    "# Function to determine number using spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c3bdf8f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Noun_number(Enum):\n",
    "    SINGULAR = 1\n",
    "    PLURAL = 2\n",
    "\n",
    "def get_nouns_number(text, model, method=\"lemma\"):\n",
    "    nouns = []\n",
    "    doc = model(text)\n",
    "    for token in doc:\n",
    "        if (token.pos_ == \"NOUN\"):\n",
    "            if method == \"lemma\":\n",
    "                if token.lemma_ != token.text:\n",
    "                    nouns.append((token.text, Noun_number.PLURAL))\n",
    "                else:\n",
    "                    nouns.append((token.text, Noun_number.SINGULAR))\n",
    "            elif method == \"morph\":\n",
    "                if token.morph.get(\"Number\") == \"Sing\":\n",
    "                    nouns.append((token.text, Noun_number.PLURAL))\n",
    "                else:\n",
    "                    nouns.append((token.text, Noun_number.SINGULAR))\n",
    "    return nouns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a275f6bb",
   "metadata": {},
   "source": [
    "# Irregular nouns using small model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ae94048e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('geese', <Noun_number.SINGULAR: 1>), ('road', <Noun_number.SINGULAR: 1>)]\n",
      "[('geese', <Noun_number.SINGULAR: 1>), ('road', <Noun_number.SINGULAR: 1>)]\n"
     ]
    }
   ],
   "source": [
    "text = \"Three geese crossed the road\"\n",
    "nouns = get_nouns_number(text, small_model, \"morph\")\n",
    "print(nouns)\n",
    "nouns = get_nouns_number(text, small_model)\n",
    "print(nouns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9f8bc9b",
   "metadata": {},
   "source": [
    "# Irregular nouns using large model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f3b17f70",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('geese', <Noun_number.SINGULAR: 1>), ('road', <Noun_number.SINGULAR: 1>)]\n",
      "[('geese', <Noun_number.PLURAL: 2>), ('road', <Noun_number.SINGULAR: 1>)]\n"
     ]
    }
   ],
   "source": [
    "#!python -m spacy download en_core_web_lg\n",
    "large_model = spacy.load(\"en_core_web_lg\")\n",
    "nouns = get_nouns_number(text, large_model, \"morph\")\n",
    "print(nouns)\n",
    "nouns = get_nouns_number(text, large_model)\n",
    "print(nouns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c03bfeb",
   "metadata": {},
   "source": [
    "# Noun number using GPT-3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4f7b7861",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('geese', 'plural')\n",
      "('road', 'singular')\n"
     ]
    }
   ],
   "source": [
    "from openai import OpenAI\n",
    "client = OpenAI(api_key=OPEN_AI_KEY)\n",
    "prompt=\"\"\"Decide whether each noun in the following text is singular or plural. \n",
    "Return the list in the format of a python tuple: (word, number). Do not provide any additional explanations.\n",
    "Sentence: Three geese crossed the road.\"\"\"\n",
    "response = client.chat.completions.create(\n",
    "    model=\"gpt-3.5-turbo\",\n",
    "    temperature=0,\n",
    "    max_tokens=256,\n",
    "    top_p=1.0,\n",
    "    frequency_penalty=0,\n",
    "    presence_penalty=0,\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "        {\"role\": \"user\", \"content\": prompt}\n",
    "    ], \n",
    ")\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55b4d5c5",
   "metadata": {},
   "source": [
    "# Converting from singular to plural and plural to singular"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0146905d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['books', 'geese', 'pens', 'points', 'deer']\n",
      "['book', 'goose', 'pen', 'point', 'deer']\n"
     ]
    }
   ],
   "source": [
    "from textblob import TextBlob\n",
    "texts = [\"book\", \"goose\", \"pen\", \"point\", \"deer\"]\n",
    "blob_objs = [TextBlob(text) for text in texts]\n",
    "plurals = [blob_obj.words.pluralize()[0] for blob_obj in blob_objs]\n",
    "print(plurals)\n",
    "blob_objs = [TextBlob(text) for text in plurals]\n",
    "singulars = [blob_obj.words.singularize()[0] for blob_obj in blob_objs]\n",
    "print(singulars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ddada67",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
